Summary

A documented analysis of the 2022 algae bloom in the river Oder based on remote sensing estimates of chlorophyll concentrations.

The idea of the script is the following:

Analysis

library(tidyverse)
library(sf)
library(sfnetworks)
library(tidygraph)
library(igraph)
library(lwgeom)
library(viridis)
library(lubridate)

River network and distances to the mouth

Read the river network data and select rivers of interest

Read in the INSPIRE data and extract all the information we need.

inspire_data_path <- "data/RWB_2016_ManagementRestrictionOrRegulationZone_2020_L.gml+xml"
river_net_full <- st_read(inspire_data_path)
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: Field with same name (identifier) already exists in
## (ManagementRestrictionOrRegulationZone). Skipping newer ones
## Reading layer `ManagementRestrictionOrRegulationZone' from data source 
##   `/home/gkraemer/progs/R/oder_algae_bloom_paper/koehler_et_al_prymnesium_bloom/data/RWB_2016_ManagementRestrictionOrRegulationZone_2020_L.gml+xml' 
##   using driver `GML'
## Simple feature collection with 4586 features and 23 fields
## Geometry type: MULTILINESTRING
## Dimension:     XY
## Bounding box:  xmin: 14.12266 ymin: 49.00253 xmax: 24.1457 ymax: 54.83366
## Geodetic CRS:  ETRS89
river_net_full <- select(river_net_full, gml_id, text, geometry)

The following rivers are potentially of interest (Oder and tributaries). We give the names in Polish exactly as in the INSPIRE file and, in parentheses, the English and German names:

  • ODRA (Oder, Oder)
  • NYSA KŁODZKA (Eastern Neisse, Glatzer Neiße)
  • NYSA ŁUŻYCKA (Lusatian Neisse, Lausitzer Neiße)
  • WARTA (Warta, Warthe)
  • KANAŁ GLIWICKI Z KŁODNICĄ OD KOZŁÓWKI DO DRAMY
  • KANAŁ GLIWICKI (Gleiwitzer Kanal)
river_filter_terms <- c(
  "ODRA",
  "NYSA KŁODZKA",
  "NYSA ŁUŻYCKA",
  "NYSA ŁUZYCKA",
  "WARTA",
  "ZB. PORAJ",
  "KANAŁ GLIWICKI",
  "KŁODNICA"
)

### Find all rows whose river name contains one of the filter terms. The terms
### are literal strings, not regular expressions, so we match with fixed = TRUE
### (otherwise e.g. the "." in "ZB. PORAJ" would match any character).
river_filter_idx <- lapply(river_filter_terms, grep, x = river_net_full$text, fixed = TRUE)
river_filter_idx <- unlist(river_filter_idx)

river_net <- river_net_full[river_filter_idx, ]

#### This is the missing piece for the WARTA: the INSPIRE data leaves a small
#### gap which splits the network into 2 components, so we add this short
#### segment manually to bring it back to 1 component. The coordinates were
#### extracted by hand using QGIS.
warta_missing_piece <- matrix(c(19.37629795, 50.59313292,
                                19.37595002, 50.59261401), 2, 2, byrow = TRUE) %>%
  list %>%                            # st_multilinestring expects a list of coordinate matrices
  st_multilinestring %>%              # build the geometry ...
  st_sfc(crs = st_crs(river_net)) %>% # ... as a geometry column in the river CRS ...
  st_sf                               # ... wrapped in an sf object so it can be row-bound

river_net <- bind_rows(river_net, warta_missing_piece)

ggplot(river_net) + geom_sf() + theme_bw()

Transform rivers of interest to network

In order to calculate the distance of each point to the mouth of the river we need to transform our river network into a connected graph.

The sfnetwork class allows us to retain the geographical information of the graph.

# The data we read in come as MULTILINESTRING
river_net_linestring <- st_cast(river_net, "LINESTRING")
## Warning in st_cast.sf(river_net, "LINESTRING"): repeating attributes for all
## sub-geometries for which they may not be constant
river_net_point <- st_cast(river_net_linestring, "POINT")
## Warning in st_cast.sf(river_net_linestring, "POINT"): repeating attributes for
## all sub-geometries for which they may not be constant
# as_sfnetwork only uses enpoints as nodes, here we cut our data up so that each
# linestring consists of only two points and therefore every point becomes an
# endpoint
river_net_collections <- lwgeom::st_split(river_net_linestring, river_net_point)

# here each geometry is a GEOMETRYCOLLECTION of many LINESTRING, we need to
# separate them.

### Each geometry in river_net_collections is a GEOMETRYCOLLECTION of many
### LINESTRINGs; turn every LINESTRING into its own one-row sf object so the
### attributes (gml_id, text) stay attached to each segment.
### We build the list with lapply + a single do.call(c, ...) instead of the
### original pattern of growing a list with c() inside a for loop over
### 1:nrow(...), which copies the accumulator on every iteration (O(n^2)).
river_net_rows <- do.call(c, lapply(seq_len(nrow(river_net_collections)), function(i) {
  river_net_collection <- river_net_collections[i, ]
  ### replacing the geometry of the one-row sf object with each LINESTRING
  ### keeps the attributes; the result of the outer do.call(c, ...) is a flat
  ### (not nested) list of one-row sf objects, same as the original loop
  lapply(river_net_collection$geometry[[1]], function(x) {
    river_net_collection$geometry <- st_sfc(x)
    river_net_collection
  })
}))


### a list of length 51000
length(river_net_rows)
## [1] 51218
### takes ~20s
## rows <- bind_rows(rows)
### this is a much faster version and should give the same result
river_net_rows_gml_id <- vapply(river_net_rows, function(x) x$gml_id, "")
river_net_rows_text <- vapply(river_net_rows, function(x) x$text, "")
## sapply doesn't work here...
river_net_rows_geometry <- lapply(river_net_rows, function(x) x$geometry)
river_net_rows_geometry <- do.call(c, river_net_rows_geometry)
st_crs(river_net_rows_geometry) <- st_crs(river_net)
river_net_rows <- st_sf(gml_id = river_net_rows_gml_id,
                        text = river_net_rows_text,
                        geometry = river_net_rows_geometry,
                        crs = st_crs(river_net))

### a sf object with 51000 LINESTRING
river_net_rows
## Simple feature collection with 51218 features and 2 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: 14.12266 ymin: 49.91582 xmax: 19.49394 ymax: 53.60762
## Geodetic CRS:  ETRS89
## First 10 features:
##                gml_id                                    text
## 1  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 2  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 3  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 4  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 5  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 6  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 7  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 8  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 9  RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
## 10 RWB_PLRW6000011513 ODRA OD OLZY DO WYPŁYWU Z POLDERU BUKÓW
##                          geometry
## 1  LINESTRING (18.28843 49.993...
## 2  LINESTRING (18.33314 49.948...
## 3  LINESTRING (18.33312 49.949...
## 4  LINESTRING (18.33307 49.949...
## 5  LINESTRING (18.33297 49.949...
## 6  LINESTRING (18.33195 49.950...
## 7  LINESTRING (18.33141 49.951...
## 8  LINESTRING (18.3309 49.9515...
## 9  LINESTRING (18.33041 49.951...
## 10 LINESTRING (18.33021 49.952...
### Now we create the actual network and add the length of each segment as
### weights for a later distance calculation. (Assignment changed from `=` to
### `<-` for consistency with the rest of the script.)
river_sfnet <- as_sfnetwork(river_net_rows, directed = FALSE) %>%
  activate("edges") %>%
  mutate(weight = edge_length())

### Sanity check: the network must form a single connected component,
### otherwise some nodes would have infinite distance to the river mouth.
stopifnot(components(river_sfnet)$no == 1)
stopifnot(is_connected(river_sfnet))

### We do not make any plots here because plotting becomes really slow due to
### the many tiny LINESTRING objects

Distance calculations

We add the distance to the river mouth as a property to each vertex of the graph

### The river mouth is taken to be the northernmost node of the network,
### i.e. the node with the largest Y (latitude) coordinate.
node_coords <- river_sfnet %>%
  activate("nodes") %>%
  st_coordinates
idx_river_mouth <- which.max(node_coords[, 2])

### Shortest-path distances (weighted by segment length, see the edge weights
### set above) from the mouth to every node, stored as a node attribute.
distance_from_mouth <- igraph::distances(river_sfnet, idx_river_mouth) %>%
  as.vector
V(river_sfnet)$distance_from_mouth <- distance_from_mouth

### Sanity plot
river_sfnet %>%
  activate("nodes") %>% {
    coords <- st_coordinates(.)
    tibble(x = coords[, "X"], y = coords[, "Y"],
           distance_from_mouth = distance_from_mouth)
  } %>%
  ggplot() +
  aes(x = x, y = y, color = distance_from_mouth) +
  geom_point()

Match chlorophyll observations to the river network

Read chlorophyll concentrations

These data have been generated by Kerstin from Brockmann Consult using the coordinates from the same INSPIRE data. The data are organized conceptually as tidy data with one observation by location and time.

# Data from Brockmann Consult
chlorophyll_data_path <- "data/pixel_extraction_Oder_20220701_20220818_full_consolidated.txt"

# Note that we did a manual intervention removing white spaces from river station names

# read into tibble
# note that we defined column types manually as read_table makes wrong guesses
chlorophyll_data <- read_table(chlorophyll_data_path,
                               col_types = list(
                                 Name = col_character(),
                                 Date = col_date(format = ""),
                                 Longitude = col_double(),
                                 Latitude = col_double(),
                                 Chlorophyll_mean = col_double(),
                                 Chlorophyll_sigma = col_double(),
                                 num_passes = col_double()),
                               na = "NaN",
                               skip = 0,
                               comment = "")


# we only care about locations where we do have valid chl observations and skip the rest
chlorophyll_data <- chlorophyll_data %>%
  drop_na(Chlorophyll_mean) %>%
  rename(X = Longitude, Y = Latitude)

chlorophyll_data_sf <- st_as_sf(chlorophyll_data, coords = c("X", "Y"))
st_crs(chlorophyll_data_sf) <- st_crs(river_net)

ggplot() +
  geom_sf(data = river_net) +
  geom_point(data = chlorophyll_data,
             aes(x = X, y = Y, color = Chlorophyll_mean))

Match distance to mouth to our chlorophyll data

### Get the index of the nearest point in the river network to each
### chlorophyll observation. V(...)$geometry extracts the node geometries
### from the sfnetwork; `$`(geometry) is the pipe-friendly form of $geometry.
nearest_vertex_idx <- river_sfnet %>%
  V %>%
  `$`(geometry) %>%
  st_nearest_feature(chlorophyll_data_sf, .)

### We take the distance to the river mouth from the river network and put it
### into the chlorophyll data
chlorophyll_data_sf$distance_from_mouth <-
  V(river_sfnet)$distance_from_mouth[nearest_vertex_idx]

Add cities and towns

We add some cities to make the maps easier to read. Data taken from Open Street Map.

oder_cities <- st_read("data/oder_cities.geojson") %>%
  st_transform(st_crs(river_sfnet)) %>%
  select(name)
## Reading layer `oder_cities' from data source 
##   `/home/gkraemer/progs/R/oder_algae_bloom_paper/koehler_et_al_prymnesium_bloom/data/oder_cities.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 5 features and 365 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: 14.55096 ymin: 49.83491 xmax: 18.28201 ymax: 53.43018
## Geodetic CRS:  WGS 84
oder_towns <- st_read("data/oder_towns.geojson") %>%
  st_transform(st_crs(river_sfnet)) %>%
  select(name)
## Reading layer `oder_towns' from data source 
##   `/home/gkraemer/progs/R/oder_algae_bloom_paper/koehler_et_al_prymnesium_bloom/data/oder_towns.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 78 features and 193 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: 14.04517 ymin: 49.59488 xmax: 18.43093 ymax: 53.65145
## Geodetic CRS:  WGS 84
oder_cities <- oder_cities %>%
  filter(name != "Ostrava")

oder_towns <- oder_towns %>%
  filter(name == "Frankfurt (Oder)")

oder_cities <-
  rbind(oder_cities, oder_towns)


nearest_vertex_idx <- river_sfnet %>%
  V %>%
  `$`(geometry) %>%
  st_nearest_feature(oder_cities, .)

oder_cities$distance_from_mouth <-
  V(river_sfnet)$distance_from_mouth[nearest_vertex_idx]

nearest_vertex_idx <- river_sfnet %>%
  V %>%
  `$`(geometry) %>%
  st_nearest_feature(oder_towns, .)

oder_towns$distance_from_mouth <-
  V(river_sfnet)$distance_from_mouth[nearest_vertex_idx]

oder_towns$distance_from_mouth
## [1] 179275.2
ggplot() +
  geom_sf(data = river_net) +
  geom_sf(data = oder_cities, aes(color = distance_from_mouth))

  ## geom_sf(data = oder_towns, aes(color = distance_from_mouth))

Chlorophyll plots

Chlorophyll vs. Distance to mouth

Plot the chlorophyll concentration as a function of the distance to the mouth of the Oder, by time slice.

library(ggspatial)

datecuts <- c("2022-07-01", "2022-07-21", "2022-08-01",
              "2022-08-05", "2022-08-09", "2022-08-16",
              "2022-08-19") %>%
  as.Date

startdate <- chlorophyll_data_sf$Date %>% min
enddate <- chlorophyll_data_sf$Date %>% max


# Format a Date as day/month, e.g. "01/07".
fmt <- function(x) format(x, "%d/%m")
# Label each interval as "first day–last day". The upper cut point is
# exclusive, so subtract one day (24 h) to display the last included day.
interval_starts <- head(datecuts, -1)
interval_ends <- datecuts[-1] - as.difftime("24:00:00")
datecutlabels <- paste(fmt(interval_starts), fmt(interval_ends), sep = "–")

chlorophyll_data_sf %>%
  mutate(date_cut = cut(Date, datecuts, labels = datecutlabels)) %>%
  ggplot() +
  aes(x = distance_from_mouth / 1000, y = Chlorophyll_mean) +
  geom_point(color = "#31a354") +
  geom_smooth(span = 0.2, color = "#fc8d62", fill = NA) +
  geom_vline(xintercept = oder_cities$distance_from_mouth / 1000) +
  geom_text(data = oder_cities, aes(x = distance_from_mouth / 1000, y = Inf, label = name),
            vjust = 1.1, hjust = 0, angle = -90) +
  ylim(0, 400) +
  scale_x_reverse() +
  labs(y = expression("Chlorophyll concentration [" * mu * "g/l]"),
       x = "Distance to river mouth [km]") +
  facet_wrap(vars(date_cut), nrow = 4) +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 9 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).

Hexbin plot

Just plotting chlorophyll concentrations for each pixel will result in overlapping pixels, therefore we need to bin the data before plotting.

cddf <- chlorophyll_data %>%
  mutate(date_cut = cut(Date, datecuts, labels = datecutlabels))

ocdf <- cbind(oder_cities, date_cut = factor(levels(cddf$date_cut)[6],
                                             levels = levels(cddf$date_cut)))

scale_data <- data.frame(date_cut = cddf$date_cut[1],
                         location = "bl")

cddf %>%
  ggplot() +
  geom_sf(data = river_net, color = "gray90") +
  stat_summary_hex(aes(x = X, y = Y, z = Chlorophyll_mean),
                   fun = mean, na.rm = TRUE) +
  geom_sf(data = oder_cities) +
  geom_segment(data = ocdf,
               aes(xend = Inf, yend = after_stat(y), geometry = geometry),
               color = "gray70", stat = "sf_coordinates") +
  geom_label(data = ocdf,
             aes(label = name, geometry = geometry),
             x = Inf, hjust = 1, label.size = NA,
             color = "gray70", stat = "sf_coordinates") +
  scale_fill_distiller(expression("Chl. [" * mu * "g/l]"),
                       palette = "YlGn",
                       direction = 1) +
  facet_wrap(vars(date_cut), nrow = 1) +
  ggspatial::annotation_scale(data = scale_data,
                              aes(location = location),
                              text_col = "gray70",
                              line_col = "gray80",
                              bar_cols = c("gray80", "white")) +
  theme_void() +
  theme(panel.spacing.x = unit(-3, "cm"),
        strip.text = element_text(hjust = 0))
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

Discharge data EFAS

Data

Discharge comes from the EFAS 6-hourly data. We use the historical version 4.0 data of river discharge accumulated over the preceding 6 hours.

functions for accessing EFAS data

library(ncdf4)

discharge_crs <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +units=m +no_defs"

get_latslons <- function(x, reproj = FALSE) {
  # Build an sf point collection with the latitude/longitude of every grid
  # cell of an open EFAS ncdf4 handle `x`, in the CRS of `river_net`.
  # Relies on the globals `discharge_crs` and `river_net` being defined.
  #
  # reproj = TRUE: reproject the projected x/y axes manually, because the
  # latitude/longitude variables in the file are not well defined. Testing it
  # there are some clear discrepancies. They are probably using a different
  # ellipsoid.
  if (reproj) {
    xx <- ncvar_get(x, "x")
    yy <- ncvar_get(x, "y")
    xxyy_grid <- expand.grid(x = xx, y = yy)
    grid_lambert <- st_as_sf(xxyy_grid, coords = c("x", "y"),
                             crs = st_crs(discharge_crs))
    res <- st_transform(grid_lambert, st_crs(river_net))
    # reproj = FALSE: just take the latitudes and longitudes as they are
    # provided by the data
  } else {
    xx <- ncvar_get(x, "longitude") %>% as.vector
    yy <- ncvar_get(x, "latitude") %>% as.vector
    xxyy_grid <- data.frame(x = xx, y = yy)
    res <- st_as_sf(xxyy_grid,
                    coords = c("x", "y"),
                    crs = st_crs(river_net))
  }
  # HACK: In the end neither of the above methods really worked so we just set
  # a threshold and remove everything below it. In order to keep the linear
  # indexing intact we do not drop those points but move them to a far away
  # location (0, 0), well outside the study region.

  # the data does not really match the river data, so here we remove
  # everything with discharge < 8 m3/s on the first time step (count selects
  # only the first time slice of "dis06").
  d <- ncvar_get(x, "dis06", count = c(-1, -1, 1))
  dmask <- as.vector(d > 8)
  res$geometry[!dmask] <- st_point(c(0, 0))
  return(res)
}

get_time <- function(x) {
  # Decode the netCDF "time" axis (seconds since the Unix epoch) of an open
  # ncdf4 handle into a POSIXct vector of length [time].
  as.POSIXct(ncvar_get(x, "time"), origin = "1970-01-01")
}

get_data <- function(x) {
  # Read the full "dis06" variable (discharge over the last 6 h) from an open
  # ncdf4 handle; returns an array with dimensions [x, y, time].
  ncvar_get(x, "dis06")
}

mask_margin <- function(x, mask) {
  # Collapse the two spatial dimensions of a [x, y, time] array into a single
  # linear [x * y] dimension and keep only the rows selected by `mask`.
  # x:    array [x, y, time]
  # mask: integer (or logical) index into the flattened [x * y] space
  d <- dim(x)
  flat <- matrix(x, nrow = d[1] * d[2], ncol = d[3])
  flat[mask, ]
}

Create mask for discharge data

The discharge data is relatively large when transformed into a data.frame in long form (i.e. with columns [location, time, discharge]). We need to create a mask first to filter the spatial locations we actually need.

discharge_file_names <- dir("data", "discharge_.*_cropped\\.nc$", full.names = TRUE)
discharge_nc <- lapply(discharge_file_names, nc_open)

discharge_nx <- ncvar_get(discharge_nc[[1]], "x") %>% length
discharge_ny <- ncvar_get(discharge_nc[[1]], "y") %>% length
discharge_nyear <- length(discharge_nc)

pixel_locations <- get_latslons(discharge_nc[[1]])
st_crs(pixel_locations) <- st_crs(chlorophyll_data_sf)

discharge_location_idxs_lin <- st_nearest_feature(chlorophyll_data_sf, pixel_locations)
discharge_location_idxs_arr <- arrayInd(discharge_location_idxs_lin, c(discharge_nx, discharge_ny))

discharge_nspace <- length(discharge_location_idxs_lin)

Load discharge data

Raw data

Connecting the data directly with the chlorophyll data would be overkill because we would blow up the number of rows by a factor equal to the number of time steps. There are many duplicated indices in discharge_location_idxs_lin; for the calculations we only need the unique locations in space.

discharge_location_idxs_lin_uniq <-
  sort(unique(discharge_location_idxs_lin))

discharge_data <- discharge_nc %>%
  lapply(function(x) {
    x2 <- get_data(x)
    mask_margin(x2, discharge_location_idxs_lin_uniq)
  })
discharge_space <- pixel_locations$geometry[discharge_location_idxs_lin_uniq]
# The time vectors can have different lengths! maybe a leap year thing?
discharge_time <- lapply(discharge_nc, get_time)

put everything into a data frame

# it would be much better to aggregate *before* transforming this into a
# data.frame. Performance is not great but okay.
discharge_df_list <- lapply(seq_len(discharge_nyear), function(i) {
  # [space, time] discharge matrix for year i
  d <- discharge_data[[i]]
  # replicate the time axis across space: length [space * time]
  t <- rep(discharge_time[[i]], each = dim(d)[1])
  # replicate the spatial coordinates across time: length [space * time]
  s <- rep(discharge_space, dim(d)[2])
  # linear location index repeated over time; named `loc` instead of reusing
  # (and shadowing) the lapply argument `i` as the original code did
  loc <- rep(discharge_location_idxs_lin_uniq, dim(d)[2])

  tibble(discharge = as.vector(d), time = t, geometry = s, location_idx = loc)
})

discharge_df <- bind_rows(discharge_df_list)

aggregate and perform MSC and anomaly calculations.

# aggregate by day
discharge_df <- discharge_df %>%
  # as.Date(xxxxx 00:00h) seems to subtract one day
  group_by(Date = as.Date(floor_date(time, "days")) + 1, geometry, location_idx) %>%
  dplyr::summarize(discharge = mean(discharge)) %>%
  ungroup
## `summarise()` has grouped output by 'Date', 'geometry'. You can override using
## the `.groups` argument.
# mean seasonal cycle
discharge_df <- discharge_df %>%
  group_by(yday = yday(Date), location_idx) %>%
  mutate(discharge_msc = mean(discharge)) %>%
  ungroup %>%
  mutate(discharge_anomaly = discharge - discharge_msc)

# cumulative discharge and discharge anomaly, re-zeroed at June 1
discharge_df <- discharge_df %>%
  group_by(year = year(Date), location_idx) %>%
  arrange(Date) %>%
  mutate(
    discharge_anom_cum_june =
      cumsum(discharge_anomaly) %>%
      { . - .[month(Date) == 6 & mday(Date) == 1] },
    discharge_cum_june =
      cumsum(discharge) %>%
      { . - .[month(Date) == 6 & mday(Date) == 1] }
  ) %>%
  ungroup

# percentile of cumulative discharge
discharge_df <- discharge_df %>%
  group_by(yday = yday(Date)) %>%
  mutate(
    discharge_anom_cum_june_perc =
      rank(discharge_anom_cum_june) %>%
      { . / max(.) * 100 },
    discharge_cum_june_perc =
      rank(discharge_cum_june) %>%
      { . / max(.) * 100 },
    discharge_perc =
      rank(discharge) %>%
      { . / max(.) * 100}
  ) %>%
  ungroup

Discharge Plots

do some plots for sanity check.

ggplot(discharge_df) +
  geom_line(aes(x = Date, y = discharge, group = location_idx, color = "discharge"), alpha = 0.1) +
  geom_line(aes(x = Date, y = discharge_msc, group = location_idx, color = "msc"), alpha = 0.1)

ggplot(discharge_df) +
  aes(x = Date, y = discharge_anomaly, group = location_idx) +
  geom_line(alpha = 0.5)

hist(discharge_df$discharge, main = "histogram of discharge")

hist(discharge_df$discharge_msc, main = "histogram of discharge msc")

Discharge at specific locations

In the end we are only interested in the discharge at some specific locations

River discharge at selected cities along the Oder

discharge_city_idxs <- discharge_location_idxs_lin_uniq[(st_nearest_feature(oder_cities, discharge_space))]

discharge_df %>%
  filter(location_idx %in% discharge_city_idxs) %>%
  left_join(tibble(city = oder_cities$name, location_idx = discharge_city_idxs)) %>%
  ggplot() +
  aes(x = Date, y = discharge_anomaly) +
  geom_line() +
  facet_wrap(vars(city), scales = "free_y")
## Joining with `by = join_by(location_idx)`

Zoom into 2022:

We are mostly interested in the algae bloom in summer 2022

discharge_df %>%
  filter(location_idx %in% discharge_city_idxs) %>%
  filter(year(Date) == 2022) %>%
  left_join(tibble(city = oder_cities$name, location_idx = discharge_city_idxs)) %>%
  ggplot() +
  aes(x = Date, y = discharge_anomaly) +
  geom_line() +
  facet_wrap(vars(city), scales = "free_y")
## Joining with `by = join_by(location_idx)`

discharge_df %>%
  filter(location_idx %in% discharge_city_idxs) %>%
  filter(year(Date) == 2022) %>%
  filter(month(Date) %in% 5:8) %>%
  left_join(tibble(city = oder_cities$name, location_idx = discharge_city_idxs)) %>%
  ggplot() +
  geom_line(aes(x = Date, y = discharge, color = "discharge")) +
  geom_line(aes(x = Date, y = discharge_msc, color = "msc")) +
  facet_wrap(vars(city), scales = "free_y")
## Joining with `by = join_by(location_idx)`

Cumulative discharge and discharge anomaly

prepare the cumulative discharge data

discharge_df_cities <- discharge_df %>%
  filter(location_idx %in% discharge_city_idxs) %>%
  filter((month(Date) %in% 6:7) | (month(Date) == 8 & mday(Date) < 16)) %>%
  left_join(tibble(city = oder_cities$name,
                   location_idx = discharge_city_idxs)) %>%
  # "normalize" Date by year, so that we can compare different years
  mutate(Date0 = as.Date(yday(Date), origin = "2021-12-31"))
## Joining with `by = join_by(location_idx)`

read in the measured data

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
data_eh_path <- "data/6030000_Eisenhuettenstadt_Q_TagWerte.csv"
data_hf_path <- "data/6030800_Hohensaaten-Finow_Q_TagWerte.csv"

read_measured_data <- function(path) {
  # Read a gauging-station discharge CSV (";"-separated, "#"-prefixed header
  # lines carrying station name and coordinates) and compute the same derived
  # quantities as for the simulated data: mean seasonal cycle (msc), anomaly,
  # and cumulative sums re-zeroed at June 1. Returns an sf object restricted
  # to June 1 -- August 15. Uses the global `river_net` for the CRS.
  l <- readLines(path, n = 4)
  city <- l[2] %>% sub("#station_name;", "", ., fixed = TRUE)
  lat <- l[3] %>% sub("#station_latitude;", "", ., fixed = TRUE)
  lon <- l[4] %>% sub("#station_longitude;", "", ., fixed = TRUE)
  df <- read_delim(
    path,
    col_names = c("time", "discharge"),
    comment = "#",
    delim = ";",
    locale = readr::locale(tz = "Europe/Berlin"),
    # fixed: argument was misspelled `show_col_type` and only worked via
    # partial matching
    show_col_types = FALSE
  )
  df <- df %>%
    mutate(X = lon,
           Y = lat,
           city = city,
           Date = lubridate::as_date(time),
           Date0 = as.Date(yday(Date), origin = "2021-12-31"),
           yday = yday(Date),
           year = year(Date)) %>%
    ## year 1962 starts in september
    filter(year != 1962) %>%
    ## we also need to do some gapfilling (linear interpolation)
    mutate(discharge = na.approx(discharge, na.rm = FALSE))

  ## na.approx doesn't fill the edges! Pad first/last value if missing.
  n <- length(df$discharge)
  if (is.na(df$discharge[n])) {
    df$discharge[n] <- df$discharge[n - 1]
  }
  if (is.na(df$discharge[1])) {
    ## BUG FIX: the original evaluated df$discharge[2] without assigning it,
    ## so a leading NA was never filled and tripped the stopifnot below.
    df$discharge[1] <- df$discharge[2]
  }

  stopifnot(!anyNA(df$discharge))

  ## calculate discharge msc (mean seasonal cycle by day of year) and anomaly
  df <- df %>%
    group_by(yday) %>%
    mutate(discharge_msc = mean(discharge, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(discharge_anomaly = discharge - discharge_msc)

  ## calculate cumulative discharge and anomaly
  df <- df %>%
    group_by(year) %>%
    arrange(Date) %>%
    ## we just calculate the cumulative sum over the entire year and
    ## subtract the value at the date where we want to start (June 1)
    mutate(
      discharge_anom_cum_june = cumsum(discharge_anomaly),
      discharge_anom_cum_june = discharge_anom_cum_june - discharge_anom_cum_june[month(Date) == 6 & mday(Date) == 1],
      discharge_cum_june = cumsum(discharge),
      discharge_cum_june = discharge_cum_june - discharge_cum_june[month(Date) == 6 & mday(Date) == 1]
    ) %>%
    ungroup()
  ## filter out the date range and transform into an sf object
  df <- df %>%
    filter((month(Date) %in% 6:7) | (month(Date) == 8 & mday(Date) < 16)) %>%
    st_as_sf(coords = c("X", "Y"), crs = st_crs(river_net))
  return(df)
}

discharge_df_cities2 <-
  bind_rows(read_measured_data(data_eh_path),
            read_measured_data(data_hf_path),
            discharge_df_cities)

discharge_df_cities2 %>%
  filter(yday(Date) == yday(max(Date))) %>%
  arrange(-year(Date))
## Simple feature collection with 280 features and 15 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 14.14112 ymin: 50.67396 xmax: 17.88984 ymax: 53.41764
## Geodetic CRS:  ETRS89
## # A tibble: 280 × 16
##    time                discharge city          Date       Date0       yday  year
##    <dttm>                  <dbl> <chr>         <date>     <date>     <dbl> <dbl>
##  1 2022-08-15 01:00:00      75.1 Eisenhüttens… 2022-08-15 2022-08-15   227  2022
##  2 2022-08-15 01:00:00     154   Hohensaaten-… 2022-08-15 2022-08-15   227  2022
##  3 NA                      300.  Szczecin      2022-08-15 2022-08-15   227  2022
##  4 NA                      155.  Frankfurt (O… 2022-08-15 2022-08-15   227  2022
##  5 NA                      117.  Zielona Góra  2022-08-15 2022-08-15   227  2022
##  6 NA                       75.4 Wrocław       2022-08-15 2022-08-15   227  2022
##  7 NA                       62.0 Opole         2022-08-15 2022-08-15   227  2022
##  8 2021-08-15 01:00:00     182   Eisenhüttens… 2021-08-15 2022-08-15   227  2021
##  9 2021-08-15 01:00:00     295   Hohensaaten-… 2021-08-15 2022-08-15   227  2021
## 10 NA                      587.  Szczecin      2021-08-15 2022-08-15   227  2021
## # ℹ 270 more rows
## # ℹ 9 more variables: discharge_msc <dbl>, discharge_anomaly <dbl>,
## #   discharge_anom_cum_june <dbl>, discharge_cum_june <dbl>,
## #   geometry <POINT [°]>, location_idx <int>,
## #   discharge_anom_cum_june_perc <dbl>, discharge_cum_june_perc <dbl>,
## #   discharge_perc <dbl>

plot cumulative discharge data

library(ggrepel)

ggplot() +
  aes(x = Date0,
      y = discharge_cum_june,
      color = year == 2022,
      group = year,
      label = year,
      linewidth = as.numeric(year == 2022)) +
  geom_line(data = discharge_df_cities2) +
  scale_color_discrete(type = c("grey50", "red")) +
  geom_text_repel(
    data = discharge_df_cities2 %>% filter(yday(Date) == yday(max(Date))),
    segment.colour = "gray80",
    size = 3,
    direction = "y",
    nudge_x = 20,
    hjust = 0
  ) +
  scale_linewidth(range = c(0.5, 1)) +
  facet_wrap(vars(city), scales = "free_y") +
  labs(y = "Cumulative discharge [m3s-1]",
       x = "Date",
       color = "Year",
       fill = "Year") +
  theme_minimal() +
  theme(legend.justification = c(1, 0),
        legend.position = "none")
## Warning: ggrepel: 44 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 41 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggplot() +
  aes(x = Date0, y = discharge_anom_cum_june,
      color = year(Date) == 2022,
      group = year(Date),
      label = year(Date),
      linewidth = as.numeric(year(Date) == 2022)) +
  geom_line(data = discharge_df_cities2) +
  scale_color_discrete(type = c("grey50", "red")) +
  geom_text_repel(
    # NOTE(review): this labels only the simulated data (discharge_df_cities),
    # whereas the previous plot labels discharge_df_cities2 (measured +
    # simulated) — confirm this asymmetry is intentional.
    data = discharge_df_cities %>% filter(yday(Date) == yday(max(Date))),
    color = "gray50",
    size = 3,
    direction = "y",
    nudge_x = 20,
    hjust = 0
  ) +
  # fixed: a bare scale_linewidth() call earlier in the chain was removed; it
  # was immediately replaced by this one and only produced the "Scale for
  # linewidth is already present" message.
  scale_linewidth(range = c(0.5, 1)) +
  facet_wrap(vars(city), scales = "free_y") +
  labs(y = "Cumulative discharge anomaly [m3s-1]",
       x = "Date",
       color = "Year",
       fill = "Year") +
  theme(legend.justification = c(1, 0),
        legend.position = "none")
## Scale for linewidth is already present.
## Adding another scale for linewidth, which will replace the existing scale.
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Spatial Discharge

dddf <- discharge_df %>%
  ## filter(Date >= as.Date("2022-07-01") & Date <= as.Date("2022-08-18")) %>%
  ## mutate(Date = cut(Date, "2 days")) %>%
  mutate(date_cut = cut(Date, datecuts, labels = datecutlabels),
         st_coordinates(discharge_df$geometry) %>% as.data.frame)

ocdf <- cbind(oder_cities, date_cut = factor(levels(cddf$date_cut)[3],
                                             levels = levels(cddf$date_cut)))

dddf %>%
  filter(!is.na(date_cut)) %>%
  ggplot() +
  geom_sf(data = river_net) +
  stat_summary_hex(aes(x = X, y = Y, z = discharge_anomaly),
                   fun = mean, na.rm = TRUE) +
  ## geom_point(aes(x = X, y = Y, color = Chlorophyll_mean)) +
  geom_sf(data = oder_cities) +
  geom_segment(data = ocdf,
               aes(xend = Inf, yend = after_stat(y), geometry = geometry),
               color = "gray70", stat = "sf_coordinates") +
  geom_label(data = ocdf,
             aes(label = name, geometry = geometry),
             x = Inf, hjust = 1, label.size = NA,
             color = "gray70", stat = "sf_coordinates") +
  scale_fill_gradient2(expression(atop("Discharge anomaly",
                                       paste("[", m^3, "/s]"))),
                       high = "#313695",
                       mid = "#ffffbf",
                       low = "#a50026") +
  ## scale_fill_viridis(expression("Chl. [" * mu * "g/l]"), direction = -1) +
  facet_wrap(vars(date_cut)) +
  ## facet_wrap(vars(Date)) +
  theme_void()
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

Chlorophyll vs. discharge

Join chlorophyll data and discharge data

## Attach the precomputed discharge-location index to each chlorophyll
## observation so both data sets share a common join key.
chlorophyll_data_sf$location_idx <- discharge_location_idxs_lin

## Combine chlorophyll and discharge observations by date and location.
## The join keys are spelled out with join_by() (instead of relying on
## the natural join) so the intent is explicit and the "Joining with ..."
## message is silenced. Discharge records outside the chlorophyll time
## range are dropped beforehand to keep the full join small.
chlorophyll_discharge_df <- full_join(
  chlorophyll_data_sf,
  discharge_df %>%
    select(Date,
           discharge_anomaly, discharge, discharge_perc,
           location_idx) %>%
    filter(Date >= min(chlorophyll_data_sf$Date),
           Date <= max(chlorophyll_data_sf$Date)),
  by = join_by(Date, location_idx)
)
## Joining with `by = join_by(Date, location_idx)`

Plot correlations

Correlations computed on the non-log-transformed data

Plot correlations 2

## Bin the joined chlorophyll/discharge data into the shared date intervals.
cddf <- chlorophyll_discharge_df %>%
  mutate(date_cut = cut(Date, datecuts, labels = datecutlabels)) 

## Annotation table so the city labels are drawn in one facet only.
## NOTE(review): this pins the labels to cddf$date_cut[1] — whichever
## interval the first row happens to fall into (possibly NA) — whereas
## the spatial plot above pins them to the third level explicitly.
## Confirm this difference is intended.
cities_annotation <- cbind(oder_cities, date_cut = cddf$date_cut[1])

## Discharge anomaly vs. chlorophyll concentration along the river,
## faceted by date interval. Chlorophyll uses the left axis; the
## discharge anomaly shares the same numeric scale (identity secondary
## axis on the right). City positions are marked with vertical lines.
ggplot(cddf) +
  geom_vline(xintercept = oder_cities$distance_from_mouth / 1000, color = "gray70") +
  geom_text(data = cities_annotation,
            aes(x = distance_from_mouth / 1000, y = Inf, label = name),
            vjust = 1.1, hjust = 0, angle = -90, color = "gray70") +
  aes(x = distance_from_mouth / 1000) +
  geom_point(aes(y = Chlorophyll_mean, color = "Chlorophyll")) +
  geom_point(aes(y = discharge_anomaly, color = "Discharge anomaly")) +
  geom_smooth(aes(y = Chlorophyll_mean), color = "#b2df8a", span = 0.2, fill = NA) +
  ## FIX: was `fille = NA`, which ggplot2 ignored with a warning and so
  ## drew the default confidence band; `fill = NA` suppresses the band,
  ## matching the chlorophyll smoother above and the second plot below.
  geom_smooth(aes(y = discharge_anomaly), color = "#a6cee3", span = 0.2, fill = NA) +
  ## distance decreases toward the mouth, so reverse the x axis
  scale_x_reverse() +
  scale_y_continuous(
    limits = c(-400, 400),
    sec.axis = sec_axis(~ ., name = expression("Discharge anomaly [" * m ^ 3 * "/s]"))
  ) +
  scale_color_discrete("", type = c("#33a02c", "#1f78b4")) +
  labs(y = expression("Chlorophyll concentration [" * mu * "g/l]"),
       x = "Distance to river mouth [km]") +
  facet_wrap(vars(date_cut), nrow = 4) +
  theme_minimal() +
  theme(legend.position = c(0.7, 0),
        legend.justification = c(0.7, 0),
        legend.title = element_blank(),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(0, 0, 0, 0),
        legend.background = element_rect(fill = "white", color = NA))
## Warning in geom_smooth(aes(y = discharge_anomaly), color = "#a6cee3", span =
## 0.2, : Ignoring unknown parameters: `fille`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 5898 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 5945 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 5898 rows containing missing values (`geom_point()`).
## Warning: Removed 5945 rows containing missing values (`geom_point()`).

## Discharge anomaly percentile vs. chlorophyll concentration along the
## river, faceted by date interval. The percentile (0-100) is stretched
## by a factor of 3 onto the chlorophyll axis and mapped back via the
## secondary axis on the right.
ggplot(cddf, aes(x = distance_from_mouth / 1000)) +
  geom_vline(xintercept = oder_cities$distance_from_mouth / 1000,
             color = "gray70") +
  geom_text(data = cities_annotation,
            aes(x = distance_from_mouth / 1000, y = Inf, label = name),
            color = "gray70", angle = -90, hjust = 0, vjust = 1.1) +
  geom_point(aes(y = Chlorophyll_mean, color = "Chlorophyll")) +
  geom_point(aes(y = discharge_perc * 3, color = "Discharge percentile")) +
  geom_smooth(aes(y = Chlorophyll_mean),
              span = 0.2, color = "#b2df8a", fill = NA) +
  geom_smooth(aes(y = discharge_perc * 3),
              span = 0.2, color = "#a6cee3", fill = NA) +
  scale_x_reverse() +
  scale_y_continuous(limits = c(0, 300),
                     sec.axis = sec_axis(~ . / 3,
                                         name = "Discharge anomaly percentile")) +
  scale_color_discrete("", type = c("#33a02c", "#1f78b4")) +
  labs(x = "Distance to river mouth [km]",
       y = expression("Chlorophyll concentration [" * mu * "g/l]")) +
  facet_wrap(~ date_cut, nrow = 4) +
  theme_minimal() +
  theme(legend.position = c(1, 1),
        legend.justification = c(1, 1),
        legend.title = element_blank(),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(0, 0, 0, 0),
        legend.background = element_rect(fill = "white", color = NA))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 5922 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 5889 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 5922 rows containing missing values (`geom_point()`).
## Warning: Removed 5889 rows containing missing values (`geom_point()`).

## Record the exact R and package versions used, for reproducibility.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Manjaro Linux
## 
## Matrix products: default
## BLAS:   /home/gkraemer/progs/R/R-4.3.1/lib/libRblas.so 
## LAPACK: /usr/lib/liblapack.so.3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.utf8        LC_NUMERIC=C              
##  [3] LC_TIME=en_US.utf8         LC_COLLATE=en_US.utf8     
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.utf8    
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.9.3     zoo_1.8-12        ncdf4_1.21        ggspatial_1.1.9  
##  [5] viridis_0.6.4     viridisLite_0.4.2 lwgeom_0.2-13     igraph_1.5.1     
##  [9] tidygraph_1.2.3   sfnetworks_0.6.3  sf_1.0-14         lubridate_1.9.2  
## [13] forcats_1.0.0     stringr_1.5.0     dplyr_1.1.3       purrr_1.0.2      
## [17] readr_2.1.4       tidyr_1.3.0       tibble_3.2.1      ggplot2_3.4.3    
## [21] tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4       xfun_0.40          bslib_0.5.1        lattice_0.21-8    
##  [5] tzdb_0.4.0         vctrs_0.6.3        tools_4.3.1        generics_0.1.3    
##  [9] parallel_4.3.1     proxy_0.4-27       fansi_1.0.4        highr_0.10        
## [13] pkgconfig_2.0.3    sfheaders_0.4.3    Matrix_1.6-1       KernSmooth_2.23-22
## [17] RColorBrewer_1.1-3 lifecycle_1.0.3    compiler_4.3.1     farver_2.1.1      
## [21] munsell_0.5.0      htmltools_0.5.6    class_7.3-22       sass_0.4.7        
## [25] yaml_2.3.7         hexbin_1.28.3      pillar_1.9.0       crayon_1.5.2      
## [29] jquerylib_0.1.4    classInt_0.4-9     cachem_1.0.8       wk_0.8.0          
## [33] nlme_3.1-163       tidyselect_1.2.0   digest_0.6.33      stringi_1.7.12    
## [37] splines_4.3.1      labeling_0.4.3     fastmap_1.1.1      grid_4.3.1        
## [41] colorspace_2.1-0   cli_3.6.1          magrittr_2.0.3     utf8_1.2.3        
## [45] e1071_1.7-13       withr_2.5.0        scales_1.2.1       bit64_4.0.5       
## [49] timechange_0.2.0   rmarkdown_2.24     bit_4.0.5          gridExtra_2.3     
## [53] hms_1.1.3          evaluate_0.21      knitr_1.43         mgcv_1.9-0        
## [57] s2_1.1.4           rlang_1.1.1        Rcpp_1.0.11        glue_1.6.2        
## [61] DBI_1.1.3          vroom_1.6.3        jsonlite_1.8.7     R6_2.5.1          
## [65] units_0.8-3